%matplotlib inline
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
from pylab import rcParams
# change default figsize
plt.rcParams['figure.figsize'] = (15, 12)
Rivers/Streams Data Source: The National Map Small-Scale website
This workbook requires the shapefiles found here.
%%time
rivers_raw = gpd.read_file('data/streaml010g/streaml010g.shp')
%%time
us_states = gpd.read_file('http://www2.census.gov/geo/tiger/GENZ2017/shp/cb_2017_us_state_20m.zip')
print('Rivers CRS =', rivers_raw.crs)
print('US States CRS =', us_states.crs)
rivers_raw.head()
us_states.head()
rivers_raw.groupby('Feature').agg(['count', 'size', 'nunique']).stack()
The most incomplete data seems to be the Name of the stream/river.
plt.rcParams['lines.linewidth']
1.5 is a little too big for all the rivers. Alter linewidth with a new variable called line_width before plotting.
fig, ax = plt.subplots()
base_color = '#f0f0f0'
border_color = base_color
line_color = '#448ee4'
line_width = 0.3
us_states.plot(ax=ax, edgecolor='gray', alpha=0.5, color=base_color)
rivers_raw.plot(ax=ax, color=line_color, linewidth=line_width, zorder=1);
#
ax.set(xlim=(-126,-65), ylim=(24,50));
fig, ax = plt.subplots()
base_color = '#f0f0f0'
border_color = base_color
line_color = '#1c68c0'
line_width = 0.5
us_states.plot(ax=ax, edgecolor='gray', alpha=0.7, color=base_color)
rivers_raw.plot(ax=ax, color=line_color, linewidth=line_width, zorder=1);
ax.set(xlim=(-90,-81.8), ylim=(36,39.5));
print('\n\nThere are',rivers_raw['Feature'].nunique(),'unique Features in the dataframe.')
print('And',rivers_raw['F_Code'].nunique(),'unique F_Code in the dataframe.')
print('\nCounts of each Feature')
print('----------------------')
rivers_raw['Feature'].value_counts()
Feature values with corrsponding F_Code¶unique_fcode = rivers_raw['Feature'].unique()
for index, code in enumerate(unique_fcode):
print(unique_fcode[index], rivers_raw['F_Code'].loc[rivers_raw['Feature'] == unique_fcode[index]].unique())
State column in the rivers dataframe contains more than expected¶this is due to the rivers being shared with other states
print(sorted(rivers_raw['State'].unique())) # as the output below shows the rivers can be shared by states
state_df = us_states.loc[us_states['STUSPS'] == 'KY']
Use the contains method with rivers
this gets the "shared" portion of the rivers
rivers_df = rivers_raw.loc[rivers_raw['State'].str.contains('KY')]
fig, ax = plt.subplots()
base_color = '#f0f0f0'
border_color = base_color
line_color = '#1c68c0'
line_width = 0.7
state_df.plot(ax=ax, edgecolor='gray', alpha=0.7, color=base_color)
rivers_df.plot(ax=ax, color=line_color, linewidth=line_width, zorder=1);
rivers_df_length = rivers_df.groupby(['Name'])['Length_mi'].agg('sum')
rivers_df_length.sort_values(ascending=False).nlargest(20)
longest_rivers_name = rivers_df.groupby('Name')['Length_mi'].agg('sum').sort_values(ascending=False).nlargest(10).index
longest_rivers_name
longest_rivers = rivers_df.loc[rivers_df['Name'].isin(longest_rivers_name)]
longest_rivers.head()
fig, ax = plt.subplots()
base_color = '#f0f0f0'
border_color = base_color
line_color = '#1c68c0'
line_width = 0.7
longriver_color = "#f44262"
state_df.plot(ax=ax, edgecolor='gray', alpha=0.7, color=base_color)
rivers_df.plot(ax=ax, color=line_color, linewidth=line_width, zorder=1);
longest_rivers.plot(ax=ax, color=longriver_color, linewidth=line_width, zorder=2);
ax.set(xlim=(-90,-81.8), ylim=(36,39.5));
from data.world
point data drawn from the EPA's toxic release inventory program
tri = pd.read_csv('https://query.data.world/s/3b3oi57gti4qhoexmg74sdc3ftz2te', index_col='FID')
tri.head()
tri.info()
tri['HUC8_CODE'].unique()
from shapely.geometry import Point
# create a Series of geometries using the point locations in the dataframe (tri)
geoms = [Point(xy) for xy in zip(tri.X, tri.Y)]
crs = {'init' :'epsg:4326'}
# use GeoPandas to create a GeoDataFrame and assign a CRS
tri_geo = gpd.GeoDataFrame(tri, crs=crs, geometry=geoms)
tri_geo.crs
fig, ax = plt.subplots()
base_color = '#f0f0f0'
border_color = base_color
line_color = '#1c68c0'
line_width = 0.7
us_states.plot(ax=ax, edgecolor='white', color='#f2f2f2', zorder=0);
rivers_raw.plot(ax=ax, color=line_color, linewidth=line_width, zorder=1);
tri_geo.plot(ax=ax, color='orange', zorder=2, markersize=.04);
ax.set(xlim=(-128,-65), ylim=(23,51))
plt.title('EPA Toxic Sites Plotted Over US Streams/Rivers', fontsize=20, color='#295b97');
fig, ax = plt.subplots()
base_color = '#f0f0f0'
border_color = base_color
line_color = '#1c68c0'
line_width = 0.7
marker_size = 2
state_df.plot(ax=ax, edgecolor='gray', alpha=0.7, color=base_color)
rivers_df.plot(ax=ax, color=line_color, linewidth=line_width, zorder=1);
tri_geo.plot(ax=ax, color='orange', zorder=2, markersize=marker_size);
ax.set(xlim=(-90,-81.8), ylim=(36,39.5)); # limit the frame to the area of state geodataframe
the mask is used to eliminate points outside the state borders
state_df.geom_type.unique()
rivers_df.geom_type.unique()
using the Shapely .unary_union method will return a single Shapely Polygon feature
state_poly = state_df.geometry.unary_union
# create new GeoDataFrame of points that intersect with the clipping polygon
state_toxic_places = tri_geo[tri_geo.geometry.intersects(state_poly)]
fig, ax = plt.subplots()
base_color = '#f0f0f0'
border_color = base_color
line_color = '#1c68c0'
line_width = 0.7
state_df.plot(ax=ax, edgecolor='gray', alpha=0.7, color=base_color)
rivers_df.plot(ax=ax, color=line_color, linewidth=line_width, zorder=1);
state_toxic_places.plot(ax=ax, color='orange', zorder=2, markersize=2);
ax.set(xlim=(-90,-81.8), ylim=(36,39.5));
goal is to identify stream possibly impacted my toxic places
To solve this, perform some coordinate operations on the datasets and project them to an equidistant conic projection. Use a modified Proj4 string adjusted to center the meridian on the US and minimize distortion between two standard parallels:
state_eqdc = state_df.to_crs('+proj=eqdc +lat_0=39 +lon_0=-96 +lat_1=33 +lat_2=45 +x_0=0 +y_0=0 +datum=NAD83 +units=m +no_defs')
tri_eqdc = state_toxic_places.to_crs('+proj=eqdc +lat_0=39 +lon_0=-96 +lat_1=33 +lat_2=45 +x_0=0 +y_0=0 +datum=NAD83 +units=m +no_defs')
rivers_eqdc = rivers_df.to_crs('+proj=eqdc +lat_0=39 +lon_0=-96 +lat_1=33 +lat_2=45 +x_0=0 +y_0=0 +datum=NAD83 +units=m +no_defs ')
fig, ax = plt.subplots()
base_color = '#f0f0f0'
border_color = base_color
line_color = '#1c68c0'
line_width = 0.7
state_eqdc.plot(ax=ax, edgecolor='gray', alpha=0.7, color=base_color)
rivers_eqdc.plot(ax=ax, color=line_color, linewidth=line_width, zorder=1);
tri_eqdc.plot(ax=ax, color='orange', zorder=2, markersize=2);
fig, ax = plt.subplots()
base_color = '#f0f0f0'
border_color = base_color
line_color = '#1c68c0'
line_width = 2
state_eqdc.plot(ax=ax, edgecolor='gray', alpha=0.7, color=base_color)
rivers_eqdc.plot(ax=ax, color=line_color, linewidth=line_width, zorder=1);
tri_eqdc.plot(ax=ax, color='orange', zorder=2, alpha=0.8, markersize=.5);
ax.set(xlim=(790000,750000), ylim=(-210000,-190000));
GeoPandas .buffer() method to create a new column in the existing GeoDataFrame¶the result will be in meters
tri_eqdc['buffer'] = tri_eqdc.buffer(500)
# note they are now polygons in meters
tri_eqdc['buffer'].head()
print('The current geometry for plots: ' + tri_eqdc.geometry.name)
# set the buffer col as the active geometry
tri_eqdc = tri_eqdc.set_geometry('buffer')
# now the geometry is the buffer column
print('The new geometry for plots: ' + tri_eqdc.geometry.name)
fig, ax = plt.subplots()
base_color = '#f0f0f0'
border_color = base_color
line_color = '#1c68c0'
line_width = 0.7
marker_size = 1
state_eqdc.plot(ax=ax, edgecolor='gray', alpha=0.7, color=base_color)
rivers_eqdc.plot(ax=ax, color=line_color, linewidth=line_width, zorder=1);
tri_eqdc.plot(ax=ax, color='orange', zorder=2, markersize=marker_size);
fig, ax = plt.subplots()
base_color = '#f0f0f0'
border_color = base_color
line_color = '#1c68c0'
line_width = 0.7
marker_size = 0.08
state_eqdc.plot(ax=ax, edgecolor='gray', alpha=0.7, color=base_color)
rivers_eqdc.plot(ax=ax, color=line_color, linewidth=line_width, zorder=1);
tri_eqdc.plot(ax=ax, color='orange', zorder=2, alpha=0.4, markersize=marker_size);
ax.set(xlim=(790000,750000), ylim=(-210000,-190000));
gpd.sjoin(line_gdf, poly_gdf, op='intersects')
tri_stream_df = gpd.sjoin(tri_eqdc, rivers_eqdc, op='intersects')
tri_stream_df.crs
tri_stream_df.head()
streamID = tri_stream_df['Stream_ID'].astype(str)
streamFromNode = tri_stream_df['From_node'].astype(str)
streamToNode = tri_stream_df['To_node'].astype(str)
combinecols = streamID + streamFromNode + streamToNode
tri_stream_df['master_ID']=combinecols
tri_stream_df['master_ID'].head()
fig, ax = plt.subplots()
base_color = '#f0f0f0'
border_color = base_color
line_color = '#1c68c0'
line_width = 0.9
marker_size = 4
state_eqdc.plot(ax=ax, edgecolor='gray', alpha=0.7, color=base_color)
rivers_eqdc.plot(ax=ax, color=line_color, linewidth=line_width, zorder=1);
tri_stream_df.plot(ax=ax, color='orange', zorder=2, markersize=marker_size);
fig, ax = plt.subplots()
base_color = '#f0f0f0'
border_color = base_color
line_color = '#1c68c0'
line_width = 0.9
marker_size = 0.008
state_eqdc.plot(ax=ax, edgecolor='gray', alpha=0.7, color=base_color)
rivers_eqdc.plot(ax=ax, color=line_color, linewidth=line_width, zorder=1);
tri_stream_df.plot(ax=ax, color='orange', zorder=2, alpha=0.8, markersize=marker_size);
ax.set(xlim=(790000,750000), ylim=(-210000,-190000));
print('\nBefore the intersect the dataframe tri_eqdc had', tri_eqdc.shape[0], 'rows and', tri_eqdc.shape[1], 'columns\n')
print('The new intersected tri_streams_df has', tri_stream_df.shape[0], 'rows and', tri_stream_df.shape[1], 'columns' )
print(len(tri_stream_df['Name'].unique()), 'unique stream/rivers names have an intersection with a toxic inventory site')
tri_streams_IndexValList = tri_stream_df.index
tri_streams_NameList = tri_stream_df['Name'].tolist()
tri_streams_FromNodeList = tri_stream_df['From_node'].tolist()
tri_streams_ToNodeList = tri_stream_df['To_node'].tolist()
tri_streams_Stream_ID = tri_stream_df['Stream_ID'].tolist()
# list of lists (tuple)
tri_streams=[]
for index in range(len(tri_streams_IndexValList)):
tri_streams.append([tri_streams_Stream_ID[index], tri_streams_FromNodeList[index],
tri_streams_ToNodeList[index]])
# One list
tri_streams=[]
for index in range(len(tri_streams_IndexValList)):
master_ID = str(tri_streams_Stream_ID[index]) + str(tri_streams_FromNodeList[index]) + str(tri_streams_ToNodeList[index])
tri_streams.append(master_ID)
# works for list of lists (tuple)
#tri_streams_unique=[]
#tri_streams_unique = [list(x) for x in set(tuple(x) for x in tri_streams)]
tri_streams_unique=[]
tri_streams_unique = [list(x) for x in set(x for x in tri_streams)]
print('The list count with duplicates: ', len(tri_streams))
print('The list count without duplicates: ',len((tri_streams_unique)))
cols = ['Stream_ID', 'From_node', 'To_node']
rivers_eqdc[cols].head()
streamID = rivers_eqdc['Stream_ID'].astype(str)
streamFromNode = rivers_eqdc['From_node'].astype(str)
streamToNode = rivers_eqdc['To_node'].astype(str)
combinecols = streamID + streamFromNode + streamToNode
rivers_eqdc['master_ID']=combinecols
rivers_eqdc['master_ID'].head()
toxic_rivers = rivers_eqdc.loc[rivers_eqdc['master_ID'].isin(tri_streams)]
cols = ['Name','Stream_ID', 'From_node', 'To_node', 'master_ID']
toxic_rivers[cols].head()
fig, ax = plt.subplots()
base_color = '#f0f0f0'
border_color = base_color
line_color = '#1c68c0'
line_width = 0.9
marker_size = 0.008
state_eqdc.plot(ax=ax, edgecolor='gray', alpha=0.7, color=base_color)
rivers_eqdc.plot(ax=ax, color=line_color, linewidth=line_width, zorder=1);
tri_stream_df.plot(ax=ax, color='orange', zorder=2, alpha=0.8, markersize=marker_size);
toxic_rivers.plot(ax=ax, color='red', linewidth=line_width, zorder=3);
ax.set(xlim=(750000,1090000), ylim=(-110500,55000));
This function takes a state and highlights its' longest rivers in a plot
def one_state(state_val):
state_df = us_states.loc[us_states['STUSPS'] == state_val]
rivers_df = rivers_raw.loc[rivers_raw['State'].str.contains(state_val)]
longest_rivers_name = rivers_df.groupby('Name')['Length_mi'].agg('sum').sort_values(ascending=False).nlargest(10).index
print(longest_rivers_name)
longest_rivers_df = rivers_df.loc[rivers_df['Name'].isin(longest_rivers_name)]
fig, ax = plt.subplots()
base_color = '#f0f0f0'
border_color = base_color
line_color = '#1c68c0'
line_width = 0.7
longriver_color = "#f44262"
state_df.plot(ax=ax, edgecolor='gray', alpha=0.7, color=base_color)
rivers_df.plot(ax=ax, color=line_color, linewidth=line_width, zorder=1);
longest_rivers_df.plot(ax=ax, color=longriver_color, linewidth=line_width, zorder=2);
#ax.set(xlim=(-90,-81.8), ylim=(36,39.5));
#return state_df, rivers_df, longest_rivers_df
xyz = one_state('UT')